Estatísticas Resumidas (summarize)

library("nycflights13")
library("tidyverse")
# Mean of `distance` over the whole table, returned as a one-row summary.
flights %>%
  summarize(media_distance = mean(distance))

# Several summary statistics can be computed in a single summarize() call.
flights %>%
  summarize(
    media_distance = mean(distance),
    mediana_distance = median(distance),
    sd_distance = sd(distance)
  )

# The same summary, stored so that individual values can be reused below.
estatisticas <- flights %>%
  summarize(
    media_distance = mean(distance),
    mediana_distance = median(distance),
    sd_distance = sd(distance)
  )

# pull() extracts the column from the summary table as a plain vector.
media_distance <- estatisticas %>%
  pull(media_distance)

A distância média dos voos é 1039.9126036.

# na.rm = TRUE drops missing values before the mean is taken.
flights %>%
  summarize(dep_delay = mean(dep_delay, na.rm = TRUE))
#' Ratio between the 90th and the 10th percentile of a vector
#'
#' Missing values are removed before the percentiles are computed.
#'
#' @param variavel Vector passed to `quantile()`.
#' @return The 90th percentile of `variavel` divided by its 10th percentile.
#'   The result keeps the name of the numerator as set by `quantile()`.
percentile_90_10 <- function(variavel) {
  p90 <- quantile(variavel, probs = 0.9, na.rm = TRUE)
  p10 <- quantile(variavel, probs = 0.1, na.rm = TRUE)

  p90 / p10
}
# A user-defined function can be used inside summarize() like any built-in
# one; here it is applied to two different columns in the same call.
flights %>%
  summarize(
    percentile_90_10_distance = percentile_90_10(distance),
    percentile_90_10_air_time = percentile_90_10(air_time)
  )

Grupos (group_by)

# group_by() on its own only attaches the grouping to the table; the effect
# shows up in the verbs applied afterwards.
flights_por_aeroporto <- flights %>%
  group_by(origin)

# One mean distance per value of `origin`.
flights %>%
  group_by(origin) %>%
  summarize(mean_distance = mean(distance))

# One mean distance per combination of `origin` and `month`.
flights %>%
  group_by(origin, month) %>%
  summarize(mean_distance = mean(distance))

# A grouped summary can be piped into further verbs: drop the "LGA" rows and
# add the mean distance multiplied by 1.60934 (the miles-to-kilometres factor).
flights %>%
  group_by(origin, month) %>%
  summarize(mean_distance = mean(distance)) %>%
  filter(origin != "LGA") %>%
  mutate(mean_distance_km = mean_distance * 1.60934)

# Within each `origin`, keep the row(s) with the largest `dep_delay`.
flights %>%
  group_by(origin) %>%
  top_n(1, dep_delay)
# Open the exercise file in the RStudio editor. The call is skipped when the
# script runs outside RStudio (e.g. via Rscript), where rstudioapi calls
# would otherwise stop the script with an error.
if (rstudioapi::isAvailable()) {
  rstudioapi::navigateToFile("aula_4_exercicios.Rmd")
}

Número de observações por Grupo (tally)

# Number of rows for each value of `origin`.
flights %>%
  group_by(origin) %>%
  tally()

# Number of rows for each combination of `origin` and `dest`.
flights %>%
  group_by(origin, dest) %>%
  tally()

Mutate por Grupo

# Unlike summarize(), a grouped mutate() keeps every row and repeats the
# group-level mean on each row of the group.
flights %>%
  group_by(origin) %>%
  mutate(media_distance = mean(distance, na.rm = TRUE))

Saindo de Agrupamentos (ungroup)

# The result of a grouped mutate() stays grouped by `origin`.
flights_media <- flights %>%
  group_by(origin) %>%
  mutate(media_distance = mean(distance, na.rm = TRUE))

# groups() shows which grouping, if any, is attached to a table.
groups(flights)
## NULL
groups(flights_media)
## [[1]]
## origin

# Still grouped by `origin`, so this returns one mean delay per origin.
# na.rm is spelled TRUE rather than T because T is an ordinary variable that
# can be reassigned.
flights_media %>%
  summarize(media_atraso = mean(dep_delay, na.rm = TRUE))

# After ungroup() the same summary is computed once over all rows.
flights_media %>%
  ungroup() %>%
  summarize(media_atraso = mean(dep_delay, na.rm = TRUE))

Porcentagens

# Each row's share of the total distance, in two steps: first store the
# total in a column, then divide by it.
flights %>%
  mutate(Total_distance = sum(distance, na.rm = TRUE)) %>%
  mutate(Pct_distance = 100 * (distance / Total_distance))

# The same percentage in one step, with the total computed inline.
flights %>%
  mutate(Pct_distance = 100 * (distance / sum(distance, na.rm = TRUE)))

# With a grouping in place, sum() is evaluated per group, so the denominator
# becomes the total distance of the row's month.
flights %>%
  group_by(month) %>%
  mutate(
    Pct_distance_por_mes = 100 * (distance / sum(distance, na.rm = TRUE))
  )

# Any number of grouping variables can define the denominator.
flights %>%
  group_by(month, day, hour, origin) %>%
  mutate(
    Pct_distance_por_mes_hora_origem =
      100 * (distance / sum(distance, na.rm = TRUE))
  )
# Percentage of all rows that belong to each `origin`. The ratio is scaled
# by 100 so that it is a percentage like the other Pct_* columns in this file.
flights %>%
  group_by(origin) %>%
  tally() %>%
  mutate(Pct_por_aeroporto = 100 * (n / sum(n)))

# Count rows per origin-month, then regroup by `origin`: each month's share
# of the rows of its origin (percentages add up to 100 within each origin).
flights %>%
  group_by(origin, month) %>%
  tally() %>%
  group_by(origin) %>%
  mutate(Pct_por_mes_no_aeroporto = 100 * (n / sum(n)))

# Same counts, regrouped by `month` instead: each origin's share of the rows
# of that month (percentages add up to 100 within each month). The column
# name reflects this denominator.
flights %>%
  group_by(origin, month) %>%
  tally() %>%
  group_by(month) %>%
  mutate(Pct_por_aeroporto_no_mes = 100 * (n / sum(n)))

Habilidade Básica de Programação: Filtros Avançados (%in%)

# Keeping rows for several destinations with a chain of `==` joined by `|`.
flights %>%
  filter(dest == "ILM" | dest == "ACK" | dest == "GRR" | dest == "PSP") %>%
  mutate(Pct_distance = 100 * (distance / sum(distance, na.rm = TRUE)))

# The same filter written with %in%, which tests membership in a vector of
# values.
flights %>%
  filter(dest %in% c("ILM", "ACK", "GRR", "PSP")) %>%
  mutate(Pct_distance = 100 * (distance / sum(distance, na.rm = TRUE)))

Resumos e Transformações de Múltiplas Colunas (summarize_all)

# Apply mean() to every column; the extra argument na.rm = TRUE is passed on
# to each mean() call.
# NOTE(review): non-numeric columns are presumably returned as NA with a
# warning from mean() - confirm this is the intended demonstration.
flights %>%
  summarize_all(mean, na.rm = TRUE)

Transformações de Colunas específicas (mutate_all, mutate_at, mutate_if)

# Apply scale() only to the columns listed in vars().
# scale() returns a matrix, so each transformed column holds a one-column
# matrix rather than a plain vector.
flights %>%
  mutate_at(
    vars(
      year, month, day, dep_time, sched_dep_time, dep_delay, arr_time,
      sched_arr_time, arr_delay, flight, air_time, distance, hour, minute
    ),
    scale
  )

# Apply scale() to every column for which is.numeric() is TRUE, without
# listing the columns by hand.
flights %>%
  mutate_if(is.numeric, scale)